We loop trough the folders containing the filtered matrix files, barcodes and genenames to read in the data in form of SingleCellExperiment (sce) objects.
library(scran)
library(scater)
library(DropletUtils)
library(ggplot2)
library(Matrix)
library(cowplot)
library(openxlsx)
MT_genes <- read.table("~/Dropbox/Postdoc/2019-12-29_BE2020/Additional_files/MT_genes.txt", sep = "\t", header = TRUE)
folders <- list.files("~/Dropbox/Postdoc/2019-12-29_BE2020/Data_v3/", full.names = TRUE)
folder.names <- list.files("~/Dropbox/Postdoc/2019-12-29_BE2020/Data_v3/", full.names = FALSE)
# Initialization of list to store sce objects
# Uncommand this when no other list exists
sce.list <- list()
# If other samples have already been analysed, load in list containing previous samples
# sce.list <- readRDS("~/Dropbox/Postdoc/2019-10-29_Gastric_IM/All_sce.rds")
# Read in the data
for(i in 1:length(folders)){
if(folder.names[i] %in% names(sce.list)){next}
else{
cur_sce <- read10xCounts(paste0(folders[i],"/outs/filtered_feature_bc_matrix/"))
colData(cur_sce)$Patient <- as.character(sapply(folder.names,
function(n){unlist(strsplit(n, "_"))[1]})[i])
colData(cur_sce)$Tissue <- as.character(sapply(folder.names,
function(n){unlist(strsplit(n, "_"))[3]})[i])
sce.list[[folder.names[i]]] <- cur_sce
}
}
# Save list
saveRDS(sce.list, "~/Dropbox/Postdoc/2019-12-29_BE2020/All_sce_unfiltered.rds")
Next we filter the cells based on several criteria. For this, we will plot these QC features first.
#read in the file if running only this chunk
##sce.list <- readRDS("~/Dropbox/Postdoc/2019-10-29_Gastric_IM///All_sce_unfiltered.rds")
for(i in 1:length(sce.list)){
cur_sce <- sce.list[[i]]
if(!("total_features_by_counts" %in% names(colData(cur_sce)))){
cur_sce <- suppressMessages(calculateQCMetrics(cur_sce))
sce.list[[i]] <- cur_sce
}
# Library size
print(ggplot(as.data.frame(colData(cur_sce))) +
geom_point(aes(1:ncol(cur_sce), log10(total_counts))) +
xlab("Cells")+ggtitle(names(sce.list)[i]))
# Number of genes detected
print(ggplot(as.data.frame(colData(cur_sce))) +
geom_point(aes(1:ncol(cur_sce), total_features_by_counts)) +
xlab("Cells")+ggtitle(names(sce.list)[i]))
# Mitochondrial genes
print(ggplot(data.frame(MT_genes = Matrix::colSums(counts(cur_sce)[rownames(cur_sce) %in%
MT_genes$Gene.stable.ID,])/
Matrix::colSums(counts(cur_sce)))*100) +
geom_point(aes(1:ncol(cur_sce), MT_genes)) +
ylab("% mitochondrial reads") + xlab("Cells")+ggtitle(names(sce.list)[i]))
# Marker gene expression
# VIM
print(ggplot(data.frame(VIM = log10(counts(cur_sce)["ENSG00000026025",] + 1))) +
geom_point(aes(1:ncol(cur_sce), VIM)) + xlab("Cells") +
ylab("log10[VIM]")+ggtitle(names(sce.list)[i]))
# PTPRC
print(ggplot(data.frame(PTPRC = log10(counts(cur_sce)["ENSG00000081237",] + 1))) +
geom_point(aes(1:ncol(cur_sce), PTPRC)) + xlab("Cells") +
ylab("log10[PTPRC]")+ggtitle(names(sce.list)[i]))
}
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
# Create xlsx file with entries for QC thresholds
df <- data.frame(names = names(sce.list),
lower_total_counts = rep(0, length(sce.list)),
upper_total_counts = rep(0, length(sce.list)),
lower_total_features = rep(0, length(sce.list)),
upper_total_features = rep(0, length(sce.list)),
lower_mito = rep(0, length(sce.list)),
upper_mito = rep(0, length(sce.list)),
VIM = rep(0, length(sce.list)),
PTPRC = rep(0, length(sce.list)))
write.xlsx(df, "~/Dropbox/Postdoc/2019-12-29_BE2020/Additional_files/QC_metrics.xlsx")
saveRDS(sce.list, "~/Dropbox/Postdoc/2019-12-29_BE2020/All_sce_unfiltered.rds")
Here we remove cells based on the filtering thresholds defined by visualizing the QC features.
QC_thresholds <- read.xlsx("~/Dropbox/Postdoc/2019-12-29_BE2020/Additional_files/QC_metrics_edited.xlsx")
# sce.list <- readRDS("~/Dropbox/Postdoc/2019-10-29_Gastric_IM///All_sce_unfiltered.rds")
for(i in 1:length(sce.list)){
cur_sce <- sce.list[[i]]
# Total counts
cur_sce <- cur_sce[,log10(colData(cur_sce)$total_counts) >
QC_thresholds[i,"lower_total_counts"] &
log10(colData(cur_sce)$total_counts) <
QC_thresholds[i,"upper_total_counts"]]
# Total features
cur_sce <- cur_sce[,colData(cur_sce)$total_features_by_counts >
QC_thresholds[i,"lower_total_features"] &
colData(cur_sce)$total_features_by_counts <
QC_thresholds[i,"upper_total_features"]]
# Mitochondria
cur_sce <- cur_sce[,(Matrix::colSums(counts(cur_sce)[rownames(cur_sce) %in%
MT_genes$Gene.stable.ID,])/Matrix::colSums(counts(cur_sce)))*100 >
QC_thresholds[i,"lower_mito"] &
(Matrix::colSums(counts(cur_sce)[rownames(cur_sce) %in%
MT_genes$Gene.stable.ID,])/Matrix::colSums(counts(cur_sce)))*100 <
QC_thresholds[i,"upper_mito"]]
# VIM
if(!is.na(QC_thresholds[i,"VIM"])){
cur_sce <- cur_sce[,counts(cur_sce)["ENSG00000026025",] <=
QC_thresholds[i,"VIM"]]
}
# PTPRC
if(!is.na(QC_thresholds[i,"PTPRC"])){
cur_sce <- cur_sce[,counts(cur_sce)["ENSG00000081237",] <=
QC_thresholds[i,"PTPRC"]]
}
sce.list[[i]] <- cur_sce
}
saveRDS(sce.list, "~/Dropbox/Postdoc/2019-12-29_BE2020/All_sce_filtered.rds")
To finish get session info:
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora 31 (Workstation Edition)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] openxlsx_4.1.4 cowplot_1.0.0
## [3] Matrix_1.2-18 DropletUtils_1.6.1
## [5] scater_1.14.6 ggplot2_3.2.1
## [7] scran_1.14.6 SingleCellExperiment_1.8.0
## [9] SummarizedExperiment_1.16.1 DelayedArray_0.12.2
## [11] BiocParallel_1.20.1 matrixStats_0.55.0
## [13] Biobase_2.46.0 GenomicRanges_1.38.0
## [15] GenomeInfoDb_1.22.0 IRanges_2.20.2
## [17] S4Vectors_0.24.3 BiocGenerics_0.32.0
## [19] rmarkdown_2.1
##
## loaded via a namespace (and not attached):
## [1] viridis_0.5.1 edgeR_3.28.0 BiocSingular_1.2.1
## [4] viridisLite_0.3.0 DelayedMatrixStats_1.8.0 R.utils_2.9.2
## [7] assertthat_0.2.1 statmod_1.4.33 dqrng_0.2.1
## [10] GenomeInfoDbData_1.2.2 vipor_0.4.5 yaml_2.2.1
## [13] pillar_1.4.3 lattice_0.20-38 glue_1.3.1
## [16] limma_3.42.2 digest_0.6.24 XVector_0.26.0
## [19] colorspace_1.4-1 htmltools_0.4.0 R.oo_1.23.0
## [22] pkgconfig_2.0.3 zlibbioc_1.32.0 purrr_0.3.3
## [25] scales_1.1.0 HDF5Array_1.14.2 tibble_2.1.3
## [28] farver_2.0.3 withr_2.1.2 lazyeval_0.2.2
## [31] magrittr_1.5 crayon_1.3.4 evaluate_0.14
## [34] R.methodsS3_1.8.0 beeswarm_0.2.3 tools_3.6.2
## [37] formatR_1.7 lifecycle_0.1.0 stringr_1.4.0
## [40] Rhdf5lib_1.8.0 munsell_0.5.0 locfit_1.5-9.1
## [43] zip_2.0.4 irlba_2.3.3 compiler_3.6.2
## [46] rsvd_1.0.2 rlang_0.4.4 rhdf5_2.30.1
## [49] grid_3.6.2 RCurl_1.98-1.1 BiocNeighbors_1.4.1
## [52] igraph_1.2.4.2 labeling_0.3 bitops_1.0-6
## [55] gtable_0.3.0 R6_2.4.1 gridExtra_2.3
## [58] knitr_1.28 dplyr_0.8.4 stringi_1.4.5
## [61] ggbeeswarm_0.6.0 Rcpp_1.0.3 tidyselect_1.0.0
## [64] xfun_0.12